2  Data analysis

Author

Izar de Villasante

2.1 Sample sheet

The correspondence between the methylome and transcriptome names are:

- vector with FULL protein (without doxy): "Tat" (RNAseq [Tat_1, Tat_2, Tat_3]) = "TAT_NO" (microarrays [HC_SEN 3,9,14])
- vector with PARTIAL protein (without doxy):"Tat72" (RNAseq [Tat72_1, Tat72_2, Tat72_3]) = "Exon1_No" (microarrays [HC_SEN 5,11,16]) 

- vector with FULL protein (with doxy): "TatDOX" (RNAseq [TatDOX_1, TatDOX_2, TatDOX_3]) = "TAT_Dox" (microarrays [HC_SEN 4,10,15])
- vector with PARTIAL protein (with doxy): "Tat72DOX" (RNAseq [Tat72DOX_1, Tat72DOX_2, Tat72DOX_3]) = "Exon1_Dox" (microarrays [HC_SEN 6,12])

- vector without protein (empty): "TetOFF" (RNAseq [TetOFF_1, TetOFF_2, TetOFF_3]) = "Control_NO" (microarrays [HC_SEN 1,7,13])
- vector without protein (empty) + doxy: "TetOFFDOX" (RNAseq [TetOFFDOX_1, TetOFFDOX_2, TetOFF_3]) = "Control_Dox" (microarrays [HC_SEN 2,8])
dtable(ss)

2.2 Expression:

2.2.1 Read counts:

namedlist <- function(...) {
  nl <- list(...)
  argnames <- sys.call()
  n<-sapply(argnames[-1], as.character)
  names(nl)<-n

  
  return(nl)
  }

library(readxl)
library(data.table)
counts<-list()
Tat72_off <- readxl::read_xlsx("data-raw/Transcriptome_HClinic_Vcasanova_20042023/Tat72VSTetOFF.xlsx", sheet = 1)
New names:
• `` -> `...1`
colnames(Tat72_off) <- c("gene",sapply(colnames(Tat72_off),function(x)unlist(strsplit(x,"\\."))[7])[-1])
setDT(Tat72_off)
Tat72_off[,ID :=tstrsplit(gene,"\\.",keep=1)]

Tat_Tat72 <- readxl::read_xlsx("data-raw/Transcriptome_HClinic_Vcasanova_20042023/TatVSTat72.xlsx", sheet = 1)
New names:
• `` -> `...1`
colnames(Tat_Tat72) <- c("gene",sapply(colnames(Tat_Tat72),function(x)unlist(strsplit(x,"\\."))[7])[-1])
setDT(Tat_Tat72)
Tat_Tat72[,ID :=tstrsplit(gene,"\\.",keep=1)]


Tat_off <- readxl::read_xlsx("data-raw/Transcriptome_HClinic_Vcasanova_20042023/TatVSTetOFF.xlsx", sheet = 1)
New names:
• `` -> `...1`
colnames(Tat_off) <- c("gene",sapply(colnames(Tat_off),function(x)unlist(strsplit(basename(x),"\\."))[1])[-1])
setDT(Tat_off)
Tat_off[,ID :=tstrsplit(gene,"\\.",keep=1)]

counts <-namedlist(Tat72_off,Tat_Tat72,Tat_off)
dtable(counts[[1]])
Warning in instance$preRenderHook(instance): It seems your data is too big for
client-side DataTables. You may consider server-side processing:
https://rstudio.github.io/DT/server.html

2.3 Differentially Expressed Genes (DEGs):

library(readxl)
library(data.table)

Tat72_off <- readxl::read_xlsx("data-raw/Transcriptome_HClinic_Vcasanova_20042023/Tat72VSTetOFF.xlsx", sheet = 3)
Tat72_off$Contrast="Tat72_off"
Tat_Tat72 <- readxl::read_xlsx("data-raw/Transcriptome_HClinic_Vcasanova_20042023/TatVSTat72.xlsx", sheet = 3)
Tat_Tat72$Contrast="Tat_Tat72"
Tat_off <- readxl::read_xlsx("data-raw/Transcriptome_HClinic_Vcasanova_20042023/TatVSTetOFF.xlsx", sheet = 3)
Tat_off$Contrast="Tat_off"
dt_DEGs <- data.table::data.table(rbind(Tat_Tat72,Tat72_off,Tat_off))
# dt_DEGs <- dt_DEGs[RealFC<0.5 | RealFC >2,]
dtable(dt_DEGs)
Warning in instance$preRenderHook(instance): It seems your data is too big for
client-side DataTables. You may consider server-side processing:
https://rstudio.github.io/DT/server.html

2.4 Methylation:

Email

“El primer objetivo es ver si la proteína TAT del VIH, induce cambios en el metiloma. La proteína TAT está codificada por 2 exones que realizan funciones diferentes y queremos además aislar el efecto de cada uno, por eso hemos generado unas líneas celulares de jurkat establemente transfectadas con:

  1. Un vector vacío que será nuestro control: muestras 1, 7 y 13 (triplicados)
  2. Un vector con la proteína TAT completa: muestras 3, 9 y 14 (triplicados)
  3. Un vector con solo el primer exón de TAT: muestras 5, 11 y 16 (triplicados)

La comparativa entre estos tres grupos es la primera que nos interesa. a-b, a-c y b-c.”

Por otro lado, la DOXYCICLINA “apaga” la expresión de TAT y el segundo objetivo entonces es ver si el “apagado” de TAT con la DOXY hace algún cambio en el metiloma. Por eso la comparativa aquí sería cada una de las tres líneas anteriores con y sin DOXY, así que tendremos además las siguientes muestras:

  1. Un vector vacío que será nuestro control + DOXY: muestras 2 y 8 (duplicados)
  2. Un vector con con la proteína TAT completa + DOXY: muestras 4, 10 y 15 (triplicados)
  3. Un vector con solo el primer exón de TAT + DOXY: muestras 6 y 12 (duplicados)

La segunda comparativa que nos interesa, por tanto, es. a-d, b-e y c-f.

While the first objective yielded positive results no difference was found for the second (Doxi) apporach.

2.4.1 PCA plot:

PCR plots:

PCA

2.4.2 DMRS:

dmrs<-readRDS("data/dmrs.rds")
ano_cols<-c("seqnames","start","end","width","no.cpgs","HMFDR","meandiff","Contrast")
dt_list<- lapply(
  1:NROW(dmrs),function(rn){  # For every position or ProbeID do:     
    UCSC <- data.table(      # 1. generate a data.table (per row)
      dmrs[rn,       # 2. split UCSC gene and position    
               lapply(       # 3. new row for every combination of gene/position
                 overlapping.genes,        # 4. add Contrast porbeID and bval difference
                 function(x)unlist(strsplit(x,","))
                 )
               ,by=ano_cols]
      )
    })
dt_dmrs <- rbindlist(dt_list)
DMRs <- dt_dmrs[,Gene:=V1][!is.na(V1),]
DMRs$V1 <- NULL
# c1<-DMRs[Contrast== "Control_No-Exon1_No",.(.SD,meandiff=-meandiff,Contrast="Tat72_off"),.SDcols=ano_cols[!ano_cols %in% c("meandiff","Contrast")]]
c1<-DMRs[Contrast== "Control_No-Exon1_No",]
c1$Contrast <- "Tat72_off"
c1$meandiff <- -c1$meandiff

c2<-DMRs[Contrast== "Control_No-TAT_No",]
c2$Contrast <- "Tat_off"
c2$meandiff <- -c2$meandiff

c3<-DMRs[Contrast== "Exon1_No-TAT_No",]
c3$Contrast <- "Tat_Tat72"
c3$meandiff <- -c3$meandiff
DMRs<-rbind(c1,c2,c3)
saveRDS(DMRs,"data/DMRs.rds")
dtable(DMRs)

2.5 DMRs vs expression

First, we intersect all differentially expressed genes and genes associated with DMR.

dt_DEG_DMR <- merge(DMRs,dt_DEGs,by=c("Gene","Contrast"),all = T)
tab <- dt_DEG_DMR[,.(DEGs=sum(!is.na(ID)),DMRs=sum(!is.na(start)),overlap=sum(!(is.na(start)) &!(is.na(ID)) )),by=c("Contrast")]
kableExtra::kbl(tab)|>kableExtra::kable_classic_2()
Contrast DEGs DMRs overlap
Tat_Tat72 3087 1092 224
Tat_off 3102 990 207
Tat72_off 1594 53 8

Then we get the mean beta value for each sample for each dmr location:

  1. Find all cpg that fall in the dmr

  2. get the mean for each sample

  3. Match each sample bVal with it’s expression value. (No way to do it, since there is no key sample to sample)

  4. Make correlation

  5. Plot.

ol <- dt_DEG_DMR[!(is.na(start)) &!(is.na(ID)),]
library(data.table)
library(GenomicRanges)
library(IlluminaHumanMethylationEPICanno.ilm10b2.hg19)

locs <- IlluminaHumanMethylationEPICanno.ilm10b2.hg19::Locations
locs.ranges <- GRanges(locs$chr, IRanges(locs$pos, locs$pos))
names(locs.ranges) <- rownames(locs)
locs.ranges$gene_element <- IlluminaHumanMethylationEPICanno.ilm10b2.hg19::Other$UCSC_RefGene_Group
locs.ranges$gene <- IlluminaHumanMethylationEPICanno.ilm10b2.hg19::Other$UCSC_RefGene_Name
dt_DEG_DMR_intersect<-lapply(1:NROW(ol),function(nr){
  cont<-ol[nr,Contrast]
  xpr <- counts[[cont]][ID==ol[nr,ID]]
  xpr<-xpr[,.SD,.SDcols=names(xpr)[!(names(xpr)%in%c("ID","gene"))]]
  expression <- suppressWarnings(melt(xpr,measure.vars=colnames(xpr),variable.name = "SID",value.name = "TPM" ))
  SIDs <- as.character(expression$SID)
  cgs <- names(subsetByOverlaps(locs.ranges,GRanges(ol[nr,])))
  #### Annotate DMRs with genomic location:
  
  ann<-as.data.table(mcols( subsetByOverlaps(locs.ranges,GRanges(ol[nr,]))))

  # Possibility of multiple genes and genomic location per probe
  genomic<-lapply(
  1:NROW(ann),function(cg){  # For every position or ProbeID do:
    UCSC <- data.table(      # 1. generate a data.table (per row)
      ann[cg,       # 2. split UCSC gene and position
               lapply(       # 3. new row for every combination of gene/position
                 .SD,        # 4. add Contrast porbeID and bval difference
                 function(x)unlist(strsplit(x,";"))
                 )
               
               ,.SDcols=c("gene_element","gene")]
      )
    })
  # 5. rbind all together:
  genomic_loc <- rbindlist(genomic)|> unique() # should get gene annotation from here to be fair
  ge <- factor(genomic_loc$gene_element,levels = c("3'UTR","ExonBnd", "1stExon","5'UTR","Body","TSS200","TSS1500"),ordered=T  )
  gene_location <- ifelse(startsWith( as.character(max(ge)),"TSS" ),"promoter","intragenic")
  gene_element  <- paste(genomic_loc$gene_element,collapse = ";")


  ####
  b<-betas[probeID %in% cgs ,lapply(.SD,function(x)mean(x,na.rm=T)),.SDcols=SIDs] 
  meth <- suppressWarnings( melt(b,measure.vars=colnames(b),variable.name = "SID",value.name = "DMR.meth" ))
  # list(meth,expression)
  m <- merge(meth,expression,by="SID")
  m <- m[,lapply(.SD,as.numeric),by=SID]
  dt<-cbind(ol[nr,],m)
  cor<-cor.test(dt$DMR.meth,dt$TPM)
  dt$correlation <- cor$estimate
  dt$cor.pval <- cor$p.value
  dt$gene_element <- gene_element
  dt$gene_location <- gene_location
  return(dt)
  })|>rbindlist()
dt_DEG_DMR_intersect[,grp:=tstrsplit(SID,"_",keep = 1)]
dt_cor <-dt_DEG_DMR_intersect
setorder(dt_cor,cor.pval,Gene)
dt_cor <- dt_cor[,lapply(.SD,function(x)if(is.character(x))as.factor(x)else(x))]
saveRDS(dt_cor,"data/dt_cor.rds")

2.5.0.1 Table for DMR & DEG:

In the following table the values for methylation and expression are integrated:

dtable(dt_cor[,.SD,.SDcols=c("Gene","Contrast","SID","DMR.meth","TPM","correlation","cor.pval")])

2.5.1 Plots

library(ggplot2)
library(data.table)
library(ggpubr)
library(ggrepel)
DMR_DEG_plots <- list()
plots_folder <- "analysis/Integration_with_expression/DEGvsDMR/pval"
dir.create(plots_folder,recursive = T,showWarnings = F)
setkeyv(dt_cor,c("cor.pval","Gene","width"))

for(cont in unique(dt_cor$Contrast)){
  dt<-head(dt_cor[Contrast==cont,],60)
  lg <- dt[,.(gene=unique(Gene),width=unique(width)),by=cor.pval] 
  
  plist <- lapply(1:NROW(lg),function(nr){
    pdata <- dt_cor[lg[nr,]]
    plot_name <- paste0("/DEG_DMR.pval_", nr, ".png")
    plt_path <- paste(plots_folder, cont, sep = .Platform$file.sep) 
    dir.create(plt_path,recursive = T,showWarnings = F)
    p<-ggplot(data=pdata, aes(x=as.numeric(TPM),y=as.numeric(DMR.meth),colour=SID)) +
    geom_point(aes(shape=grp),size=8,alpha=0.6) +
    xlab("Mean DMR methylation") + ylab("Transcripts x Milion") +

    geom_smooth(method = "lm",
                inherit.aes = F,
                aes(x=as.numeric(pdata$TPM),y=as.numeric(pdata$DMR.meth)),
                linetype="dashed",
                se=F,
                formula = "y ~ x")+
    
    annotate(geom = "text",
             x = -Inf, y = Inf,
             label=paste0("correlation: ", round(unique(pdata$correlation),4), ", p-value: ", round(unique(pdata$cor.pval),8)),
             hjust = 0, vjust = 1
             )  +

    ggtitle(paste0(" Correlation between expression and methylation for gene: ",unique(pdata$Gene)))
    ggsave(path = plt_path, filename = plot_name, plot=p)
    return(p)
  })
  DMR_DEG_plots[[cont]] <- plist
  }
saveRDS(DMR_DEG_plots,"data/DMR_DEG_plots.rds")

2.5.1.1 Tat_Tat72

DMR_DEG_plots[["Tat_Tat72"]]

2.5.1.2 Tat_off

DMR_DEG_plots[["Tat_off"]]

2.5.1.3 Tat72_off

DMR_DEG_plots[["Tat72_off"]]

The pathways analysis will be performed analysing 2 sets of genes: 1. Full DMR/DMPs: the full set of significant diferentially methylated & expressed genes. 2. Cor.sig: A subset of the above where the correlation p.value is smaller than 0.05

2.6 DNA methylation context:

Although the relationship between DNA methylation and gene expression is complex and the mechanisms involved in gene regulation by DNA methylation are diverse. Fortunately, over the years researchers have found some common relationships between gene methylation and gene expression:

DNA methylation is an epigenetic mark that has been traditionally associated with gene silencing, specially when methylation happens on promoter regions. DNA methylation is related with the repressed chromatin state which blocks the access of transcription factors to promoter regions. Thus, silencing promoter activity in a stable way and reducing transcription Suzuki and Bird (2008).

In the other hand, the bodies of active genes in plants and animals are often heavily methylated. Suzuki and Bird (2008)

Let’s inspect how is the relationship of methylation with expression in our data. the following table shows a summary of the paired DMGs and DEGs:

tab.full <- dt_cor[,.(negative.cor=sum(correlation<0)/6,positive.cor=sum(correlation>0)/6,Total=sum(correlation!=0)/6),by=c("Contrast")]
kableExtra::kbl(tab.full)|>kableExtra::kable_classic_2()
Contrast negative.cor positive.cor Total
Tat_off 84 123 207
Tat_Tat72 106 118 224
Tat72_off 3 5 8
Warning

Be aware that DMGs are based on DMRs. More than one DMR can be mapped to the same gene. Therefore, there can be more than one pair of DMG/DEG for the same gene as it happens for LTBP1 gene in Tat72_off contrast.

Contrast seqnames start end no.cpgs width gene_element gene_location Gene
Tat72_off chr13 67551364 67551521 3 158 TSS200;Body;Body promoter PCDH9
Tat72_off chr3 15311021 15311269 4 249 Body;5’UTR intragenic SH3BP5
Tat72_off chr5 172070871 172072282 3 1412 Body intragenic NEURL1B
Tat72_off chr3 116163694 116164943 13 1250 5’UTR;1stExon;TSS200;Body;TSS1500 promoter LSAMP
Tat72_off chr4 157713603 157714561 3 959 Body intragenic PDGFC
Tat72_off chrY 21877683 21878594 3 912 Body;ExonBnd intragenic KDM5D
Tat72_off chr2 33495871 33495908 3 38 Body intragenic LTBP1
Tat72_off chr2 33216742 33217280 3 539 Body intragenic LTBP1

In the case of our data there are more positive than negative correlations. You can inspect the data in the following table:

dtable(dt_cor[,.SD,.SDcols=c("Gene","Contrast","width","SID","DMR.meth","TPM","correlation","cor.pval","gene_element", "gene_location")])

2.7 Full

This dataset is not filtered by the strength of the correlation between DMRs and DEGs.

2.7.1 Pathways

path_results<-function(pathway,topN=50,method="method",pval=0.05,path="results/pathways.csv",cols=NULL){
  # pathway<-pathway[FDR<1,]
  pathway$method<-pathway[[method]]
  data.table::setorder(pathway,method,FDR)
  sig_idx <- pathway[,.I[FDR < pval]  ,by=method]$V1
  head_idx<-pathway[,.I[1:min(..topN,.N)],by=c(method,"Contrast")]$V1
  res<-pathway[base::union(sig_idx,head_idx),]
  # res[,TERM:=ifelse(FDR<pval,paste0("*** ",TERM," ***"),TERM)]
  # data.table::fwrite(res,path)
  results<-res[,.SD,.SDcols=c("Contrast","FDR",cols,"TERM","method")]
  data.table::setorder(results,Contrast,method,FDR)
  return(results)
}

library(gprofiler2)
pathways_full <- lapply(unique(dt_cor$Contrast),function(cont){
  library(gprofiler2)
  library(data.table)
   p<-gprofiler2::gost(signif = T ,unique(dt_cor[Contrast==cont,Gene]))[[1]]
   if(!is.null(p)){
    dth<- data.table::as.data.table(p)
    dth[,FDR:=p_value]
    dth[,TERM:=term_name]
    dth[,source:=factor(source)]
    dth[,Contrast:=cont]
   }
  })
No results to show
Please make sure that the organism is correct or set significant = FALSE
Pathway_Full<-path_results(rbindlist(pathways_full),method="source",cols = c("term_size","query_size","intersection_size"))
saveRDS(Pathway_Full,"data/pathway_full.rds")
dtable(Pathway_Full)

As you can see the terms are rather general. Let’s try to filter based on the direction of correlation + / -:

2.7.1.1 Positive:

dt_pos <- dt_cor[correlation>0,]
pathways_full_positive <- lapply(unique(dt_pos$Contrast),function(cont){
  library(gprofiler2)
  library(data.table)
   p<-gprofiler2::gost(signif = T ,unique(dt_pos[Contrast==cont,Gene]))[[1]]
   if(!is.null(p)){
    dth<- data.table::as.data.table(p)
    dth[,FDR:=p_value]
    dth[,TERM:=term_name]
    dth[,source:=factor(source)]
    dth[,Contrast:=cont]
   }
  })
No results to show
Please make sure that the organism is correct or set significant = FALSE
Pathway_Full_Pos<-path_results(rbindlist(pathways_full_positive),method="source",cols = c("term_size","query_size","intersection_size"))
saveRDS(Pathway_Full_Pos,"data/pathway_full_positive.rds")
dtable(Pathway_Full_Pos)

2.7.1.2 Negative:

dt_neg <- dt_cor[correlation < 0,]
pathways_full_negitive <- lapply(unique(dt_neg$Contrast),function(cont){
  library(gprofiler2)
  library(data.table)
   p<-gprofiler2::gost(signif = T ,unique(dt_neg[Contrast==cont,Gene]))[[1]]
   if(!is.null(p)){
    dth<- data.table::as.data.table(p)
    dth[,FDR:=p_value]
    dth[,TERM:=term_name]
    dth[,source:=factor(source)]
    dth[,Contrast:=cont]
   }
  })
No results to show
Please make sure that the organism is correct or set significant = FALSE
Pathway_Full_neg<-path_results(rbindlist(pathways_full_negitive),method="source",cols = c("term_size","query_size","intersection_size"))
saveRDS(Pathway_Full_neg,"data/pathway_full_negative.rds")
dtable(Pathway_Full_neg)

2.8 cor.sig

This dataset is not filtered by the strength of the correlation between DMRs and DEGs.

2.8.1 Pathways

dt.sig <-  dt_cor[cor.pval<0.05 ,.(correlation=unique(correlation)),by=c("Contrast","Gene","width")]


library(gprofiler2)
pathways_full <- lapply(unique(dt.sig$Contrast),function(cont){
  library(gprofiler2)
  library(data.table)
   p<-gprofiler2::gost(signif = T ,unique(dt.sig[Contrast==cont,Gene]))[[1]]
   if(!is.null(p)){
    dth<- data.table::as.data.table(p)
    dth[,FDR:=p_value]
    dth[,TERM:=term_name]
    dth[,source:=factor(source)]
    dth[,Contrast:=cont]
   }
  })
No results to show
Please make sure that the organism is correct or set significant = FALSE
Pathway_Full<-path_results(rbindlist(pathways_full),method="source",cols = c("term_size","query_size","intersection_size"))
saveRDS(Pathway_Full,"data/pathway_full_sig.rds")
dtable(Pathway_Full)

As you can see the terms are rather general. Let’s try to filter based on the direction of correlation + / -:

2.8.1.1 Positive:

dt_pos <- dt.sig[correlation>0,]
pathways_full_positive <- lapply(unique(dt_pos$Contrast),function(cont){
  library(gprofiler2)
  library(data.table)
   p<-gprofiler2::gost(signif = T ,unique(dt_pos[Contrast==cont,Gene]))[[1]]
   if(!is.null(p)){
    dth<- data.table::as.data.table(p)
    dth[,FDR:=p_value]
    dth[,TERM:=term_name]
    dth[,source:=factor(source)]
    dth[,Contrast:=cont]
   }
  })
No results to show
Please make sure that the organism is correct or set significant = FALSE
Pathway_Full_Pos<-path_results(rbindlist(pathways_full_positive),method="source",cols = c("term_size","query_size","intersection_size"))
saveRDS(Pathway_Full_Pos,"data/pathway_full_positive_sig.rds")
dtable(Pathway_Full_Pos)

2.8.1.2 Negative:

dt_neg <- dt.sig[correlation < 0,]
pathways_full_negitive <- lapply(unique(dt_neg$Contrast),function(cont){
  library(gprofiler2)
  library(data.table)
   p<-gprofiler2::gost(signif = T ,unique(dt_neg[Contrast==cont,Gene]))[[1]]
   if(!is.null(p)){
    dth<- data.table::as.data.table(p)
    dth[,FDR:=p_value]
    dth[,TERM:=term_name]
    dth[,source:=factor(source)]
    dth[,Contrast:=cont]
   }
  })
No results to show
Please make sure that the organism is correct or set significant = FALSE
Pathway_Full_neg<-path_results(rbindlist(pathways_full_negitive),method="source",cols = c("term_size","query_size","intersection_size"))
saveRDS(Pathway_Full_neg,"data/pathway_full_negative_sig.rds")
dtable(Pathway_Full_neg)